------------------------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment\4_segmentvars.log
  log type:  text
 opened on:  18 Dec 2014, 18:05:19

. 
. 
. ********************************************************************************
. ********                                                              **************
. ********  in this file we create a sample of compustat variables  **************
. ********                compustatvars.dta                               **************
. ********************************************************************************
. 
. 
. use "compustat_master_america.dta", clear

. so ticker year

. keep  ticker vantagekey name countryinc employees sales forn_sal ta year SICALL2-SICALL15 SIC4 CUSIP rdexpense mkvalq

. rename CUSIP cusip

. 
.                 **********************************************************
.                 *****                VERY IMPORTANT STEP             *****
.                 *****           WE KEEP ONLY AMERICAN FIRMS          *****
.                 *****             FOR WHICH WE HAVE SALES            *****
.                 **********************************************************
. 
. * sales are changed into sales_new by substracting forn_sales
. replace forn_sales=0 if  forn_sales==.
(279952 real changes made)

. gen sales_new=sales- forn_sales
(169202 missing values generated)

. count if sales_new<0
   84

. replace sales_new=. if sales_new<0
(84 real changes made, 84 to missing)

. count if sales_new==.
169286

. drop if sales_new==.
(169286 observations deleted)

. label var sales_new "Sales - foreign sales"

. 
. compress
  ticker was str8 now str7
  (112,562 bytes saved)

. drop countryinc vantagekey sales forn_sales

. 
. 
. ********************here we merge this dataset with the segment data***********
. ******************and do some checks:
. 
. merge 1:m ticker year using "segmentdata.dta"
(note: variable ticker was str7, now str8 to accommodate using data's values)
(note: variable year was int, now float to accommodate using data's values)

    Result                           # of obs.
    -----------------------------------------
    not matched                        82,494
        from master                    31,122  (_merge==1)
        from using                     51,372  (_merge==2)

    matched                           119,275  (_merge==3)
    -----------------------------------------

. 
. 
.                 /*
>                 keep if _merge ==2
>                 sort  ticker
>                 drop if ticker[_n] ==ticker[_n-1]
>                 keep ticker
>                 merge 1:m ticker using "compustatvars_1.dta"
> 
>                 use "compustatvars_2.dta", clear
>                 keep if _merge == 1
>                 sort ticker 
>                 drop  if  SICALL3 ==.
>                 count if ticker[_n] != ticker[_n-1]
> 
>                 use "compustatvars_2.dta", clear
>                 keep if _merge ==3
>                 sort ticker
>                 drop if ticker[_n] ==ticker[_n-1]
>                 count if  cusip !=  cusip_segm
>                 */
. 
. 
. * Here we drop the observations wich have several SIC-codes in the 
. *compustat data set, but are not included in the segment data set.              
. drop  if  SICALL3 !=.&_merge ==1
(12049 observations deleted)

. 
. drop SICALL*

. 
. replace sales_segm = sales_new if _merge ==1
(19073 real changes made)

. destring sics1, replace
sics1 has all characters numeric; replaced as int
(19073 missing values generated)

. replace sics1 = SIC4 if _merge ==1
(19073 real changes made)

. replace ta_segm = ta if _merge ==1
(18942 real changes made)

. replace rds_segm = rdexpense if _merge ==1
(6848 real changes made)

. 
. gen dataset = "segm"

. replace dataset = "comp" if _merge ==1
(19073 real changes made)

. replace dataset ="inboth" if _merge ==3
dataset was str4 now str6
(119275 real changes made)

. label var dataset "Indicates the origin of Obvervation"

. 
. ***************************************************************************
. * Now the cusip identifier is taken from the the segment data
. * 
. 
. gen cusip9 = cusip_segm
(19073 missing values generated)

. replace cusip9 =cusip if cusip9 == ""
(19073 real changes made)

. 
. rename name name_comp

. rename conm name_segm

. gen name = name_segm
(19073 missing values generated)

. replace name = name_comp if name ==""
(19073 real changes made)

. 
. 
. 
. 
. ****************************************************************************
. * It occures that the cusip Identifier is the same for different ticker from the
. * compustat and the segment data. Example:
. * ticker        cusip9  year    dataset
. ***AHN  00209C102       1986    comp
. ***AHNA 00209C102       1986    segm
. * since the segment data is our base dataset, we keep the ticker from the Segment data: 
. ****************************************************************************
. 
. sort cusip9 ticker dataset

. gen j = 1 if cusip9==cusip9[_n-1]& ticker !=ticker[_n-1]
(189515 missing values generated)

. egen k = max(j), by(cusip9)
(186553 missing values generated)

. gen ticker2 = ticker if k ==1 & dataset =="comp"
(188320 missing values generated)

. sort cusip9 dataset

. replace ticker2 = ticker2[_n-1] if ticker2=="" & k == 1
(1767 real changes made)

. drop if k == 1 & dataset == "comp"
(1400 observations deleted)

. 
. drop j k

. 
. ****************************************************************************
. * It occures that the firms Name is the same for different ticker from the
. * compustat and the segment data. Example:
. *name                                           ticker  cusip9          year    dataset
. *ADAPTIVE BROADBAND CORP.       ADAP    00650M104       1986    segm
. *ADAPTIVE BROADBAND CORP.       ADAPQ   00650M10X       1986    comp
. * since the segment data is our base dataset, we keep the cusip from the Segment data: 
. ****************************************************************************
. 
. 
. sort name ticker dataset

. gen j = 1 if name==name[_n-1]& ticker !=ticker[_n-1]
(186920 missing values generated)

. egen k = max(j), by(name)
(164937 missing values generated)

. replace ticker2 = ticker if k ==1 & dataset =="comp"
(9484 real changes made)

. sort name dataset

. replace ticker2 = ticker2[_n-1] if ticker2=="" & k == 1
(13899 real changes made)

. 
. drop if k == 1 & dataset == "comp"
(9484 observations deleted)

. replace ticker2 = ticker if ticker2 == ""
ticker2 was str6 now str8
(163170 real changes made)

. drop j k

. 
. * this step is yust a consistency chek.
. sort year ticker2

. gen g = 1 if ticker2 == ticker2[_n-1] & ticker!= ticker[_n-1]
(178710 missing values generated)

. egen x = max(g), by(ticker2)
(178195 missing values generated)

. drop if x == 1& dataset == "inboth"
(155 observations deleted)

. drop g x ticker2

. 
. 
. 
. egen s = seq(), by(year ticker sics1)

. count if s !=1
 8750

. 
. ****************************************************************************
. *if the firms reports different segments and Compustat assign them the
. *same SIC code we aggregate them:
. ****************************************************************************
. 
. egen sales = total(sales_segm), by(year ticker sics1)

. label var sales "Sales in segment, in thousands"

. egen ta_new = total(ta_segm), by(year ticker sics1)

. label var ta_new "Total Asset in segment, in thousands"

. egen rd_new = total(rds_segm), by(year ticker sics1)

. label var rd_new "R&D Expenses in segment, in thousands"

. keep if s ==1
(8750 observations deleted)

. replace ta_new = . if ta_segm ==.
(4224 real changes made, 4224 to missing)

. replace rd_new = . if rds_segm==.
(116429 real changes made, 116429 to missing)

. drop sales_segm ta_segm rds_segm

. 
. ****************************************************************************
. *Firms from the segment data which only have one segment reported:
. *The missing of r&d and ta can be replaced by available entries from the 
. * compustat data:
. ****************************************************************************
.   
. egen single = total(s), by(year ticker)

. replace rd_new = rdexpense if single ==1 & _merge == 3 & rd_new ==.
(820 real changes made)

. replace ta_new = ta if single ==1 & _merge == 3 & ta_new ==.
(279 real changes made)

. drop s single

. 
. ****************************************************************************
. *Since total asset und R&D are often missing in
. *the segment file we produce two alternative variables (TAcorr
. *and R&Dcorr) which are defined as the proportion of TA and R&D from the
. *main file according to the sales figures from the segment file.
. ****************************************************************************
. 
. *This appraoch only makes sense for observations, where merge ==3,
. * since only then we have the ta and rd information from the main file
. * and the sales from the segment data:
. egen  sales_tot = total(sales), by(year ticker)

. gen sales_prop = sales/sales_tot if sales_tot !=0
(3654 missing values generated)

. 
. label var sales_prop "Proportion of total sales in segment"

. drop sales_tot

. 
. egen ta_sum = max(ta) if _merge ==3 & ta_new==., by(year ticker)
(168810 missing values generated)

. gen ta_corr = ta_sum*sales_prop
(168812 missing values generated)

. replace ta_corr = ta_new if ta_corr == .
(165986 real changes made)

. drop ta_sum

. 
. 
. egen rd_sum = max(rdexpense) if _merge ==3 & rd_new==. , by(year ticker)
(154683 missing values generated)

. gen rd_corr = rd_sum*sales_prop
(154695 missing values generated)

. replace rd_corr = rd_new if rd_corr == .
(54322 real changes made)

. drop rd_sum ta SIC4 mkvalq rdexpense gvkey employees sales_new  name_segm cusip_segm _merge name_comp cusip

. rename sics1 SIC

. 
. rename cusip9 cusip

. 
. gen SIC2=int(SIC/100)

. label var SIC2 "SIC4/100"

. 
. save "compustat+segment.dta", replace
file compustat+segment.dta saved

. 
. ********************* we correct some variables  ****************
. ********************* and generate some others     ****************
. 
. so year ticker SIC

. 
. egen firmnum=group(ticker)

. label var firmnum "group(ticker)"

. egen ncomp=count(firmnum), by(SIC year)

. label var ncomp "number of competitors per year/SIC4"

. drop firmnum

. 
. ********************************************************
. *****           
. *****           here we define market shares (our dep. var.) defining the market as the SIC4 industry
. *****           
. ********************************************************
. 
. 
. egen tot_sales=sum(sales), by (SIC year)

. label var tot_sales "sum of sales by SIC, year"

. gen MS=sales/tot_sales
(33 missing values generated)

. label var MS "Market shares firm/year at the SIC level"

. 
. gen aa=MS*MS
(33 missing values generated)

. egen HHI=sum(aa), by(SIC year)

. label var HHI "HHI index at the SIC level"

. drop aa

. 
. so ticker year SIC

. save "segment_compustatvars.dta", replace
file segment_compustatvars.dta saved

. 
. ********************************************************
. *****           
. *****   We have to make the dataset wide such that we have 
. *****   only one entry per firm/year.
. *****   This becomes important when merging it to the RJY dataset
. *****           
. ********************************************************
. 
. keep ticker  year SIC MS dataset 

. egen nr_sic = seq(), by(year ticker) 

. reshape wide SIC MS, i(ticker year) j(nr_sic)
(note: j = 1 2 3 4 5 6 7 8 9 10)

Data                               long   ->   wide
-----------------------------------------------------------------------------
Number of obs.                   169931   ->  122423
Number of variables                   6   ->      23
j variable (10 values)           nr_sic   ->   (dropped)
xij variables:
                                    SIC   ->   SIC1 SIC2 ... SIC10
                                     MS   ->   MS1 MS2 ... MS10
-----------------------------------------------------------------------------

. sort  year ticker

. count if ticker[_n] == ticker[_n-1]
    0

. *0
. 
. save "segment_wide.dta", replace
file segment_wide.dta saved

. 
. keep ticker year

. 
. save "ticker_year.dta", replace
file ticker_year.dta saved

. log close
      name:  <unnamed>
       log:  E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment\4_segmentvars.log
  log type:  text
 closed on:  18 Dec 2014, 18:06:08
------------------------------------------------------------------------------------------------------------------------------------------
